# read the raw New York Times article export
PP_NYT <- read.csv(
  file = "/Users/elizabeth/Documents/Berkeley/PS239T/ps239T-final-project/Data/PP_NYT.csv"
)
# keep only the columns that are meaningful for the analysis, in a working copy called `pp`
keep_cols <- c("DATE", "TITLE", "LENGTH", "GRAPHIC", "SECTION", "BYLINE", "DATELINE", "TEXT")
pp <- PP_NYT[keep_cols]
names(pp)
## [1] "DATE" "TITLE" "LENGTH" "GRAPHIC" "SECTION" "BYLINE"
## [7] "DATELINE" "TEXT"
# create a new date variable that will represent the article date in YYYY-MM-DD format
# (the format string expects text like "January 5, 2010"; "%B" matches full month
# names in the current locale, so parsing assumes an English locale)
date <- as.character(pp$DATE)
betterDates <- as.Date(date, format = "%B %d, %Y")
pp$date.num <- betterDates
# create a variable for just the year each article was published, stored as text
# ("YYYY"); it is NA wherever the date could not be parsed
pp$year <- format(pp$date.num, "%Y")
# create a variable for the year and month each article was published ("YYYY-MM")
pp$yearmonth <- format(pp$date.num, "%Y-%m")
Now we’re going to look at the topics of the articles on Planned Parenthood, and at how they change over time. Though we’re using this analysis on Planned Parenthood articles, we could use it on any corpus to analyze how the topics of a set of documents have changed over a given variable—be it time, from document to document, by person, and so on.
# fix the seed of R's random number generator so R-side randomness is repeatable
# NOTE(review): MALLET runs inside the JVM (via rJava), so this seed presumably does
# not reach the topic sampler; the model object may need its own seed (e.g. a
# setRandomSeed() call before the documents are loaded) -- confirm against the mallet docs
set.seed(1234)
# load the libraries we will need for this section
library(mallet) # a wrapper around the Java machine learning tool MALLET
## Loading required package: rJava
library(wordcloud) # to visualize wordclouds
## Loading required package: RColorBrewer
# subset the data to articles published after 2009 (i.e. 2010 onward), for an initial
# analysis of the topic models; `year` is stored as text, so it is converted to an
# integer before the numeric comparison (rows with a missing year are dropped)
pp.2010 <- subset(pp, as.integer(year) > 2009)
# we first have to create an 'id' column; the row names serve as document ids
pp.2010$id <- rownames(pp.2010)
# remove punctuation: every punctuation character is replaced by a space
# (this includes apostrophes, so the "'" allowed by token.regexp below never occurs)
pp.2010$TEXT <- gsub(pattern = "[[:punct:]]", replacement = " ", x = pp.2010$TEXT)
# load data into mallet: document ids, document text, the stop-word file, no case
# preservation, and a token pattern of letter runs
mallet.instances <- mallet.import(
  pp.2010$id,
  pp.2010$TEXT,
  "/Users/elizabeth/Documents/Berkeley/PS239T/ps239T-final-project/Data/stoplist.csv",
  preserve.case = FALSE,
  token.regexp = "[\\p{L}']+"
)
# how many topics the model should look for
n.topics <- 10
# build the topic trainer object and hand it the imported documents
topic.model <- MalletLDA(n.topics)
topic.model$loadDocuments(mallet.instances)
# pull the vocabulary and per-word frequency statistics; after running this code once
# through, i went back and re-curated the stop word lists, to remove some of the more
# frequently used words that weren't otherwise caught
vocabulary <- topic.model$getVocabulary()
word.freqs <- mallet.word.freqs(topic.model)
# peek at the first 50 vocabulary entries
word.freqs[seq_len(50), ]
## words term.freq doc.freq
## 1 weight 3 2
## 2 political 1019 478
## 3 universe 15 13
## 4 shoulders 13 13
## 5 candidates 662 253
## 6 running 315 203
## 7 senate 984 315
## 8 seat 138 69
## 9 long 638 407
## 10 held 354 270
## 11 edward 37 36
## 12 kennedy 117 58
## 13 embarked 8 8
## 14 frenzied 3 3
## 15 day 668 380
## 16 campaigning 43 36
## 17 monday 301 188
## 18 groups 550 302
## 19 sides 138 111
## 20 health 2165 678
## 21 care 1324 563
## 22 debate 938 353
## 23 flooded 10 8
## 24 state 2436 621
## 25 money 831 370
## 26 advertisements 30 26
## 27 ground 161 115
## 28 troops 46 28
## 29 influence 64 55
## 30 outcome 48 43
## 31 frenetic 3 3
## 32 end 501 342
## 33 race 470 241
## 34 originally 26 25
## 35 thought 217 161
## 36 cakewalk 2 2
## 37 martha 27 20
## 38 coakley 24 10
## 39 democratic 729 311
## 40 attorney 144 95
## 41 general 359 223
## 42 massachusetts 226 112
## 43 overwhelmingly 25 22
## 44 polls 210 126
## 45 showed 163 132
## 46 scott 137 103
## 47 brown 175 69
## 48 republican 2627 632
## 49 senator 808 331
## 50 closed 101 78
# rank the vocabulary from the most to the least frequent term and show the top rows
freq_rank <- with(word.freqs, order(-term.freq))
word.freqs.ordered <- word.freqs[freq_rank, ]
head(word.freqs.ordered)
## words term.freq doc.freq
## 304 abortion 4148 640
## 661 women 3961 709
## 282 planned 3268 1285
## 283 parenthood 3173 1276
## 48 republican 2627 632
## 24 state 2436 621
# optimize hyperparameters every 20 iterations, after 50 burn-in iterations
topic.model$setAlphaOptimization(20, 50)
# now train a model, specifying the number of iterations
topic.model$train(100)
# get the probability of topics in documents and the probability of words in topics; by default the functions return word counts, so to get the probabilities we can normalize and add smoothing, in order to ensure that nothing has a probability of exactly 0
doc.topics <- mallet.doc.topics(topic.model, smoothed = TRUE, normalized = TRUE)
topic.words <- mallet.topic.words(topic.model, smoothed = TRUE, normalized = TRUE)
# what are the top words in topic 4? (row 4 of topic.words)
mallet.top.words(topic.model, topic.words[4, ])
## words weights
## 1 republican 0.015374162
## 2 campaign 0.014888537
## 3 voters 0.012210661
## 4 party 0.012030286
## 5 women 0.011919286
## 6 trump 0.008339534
## 7 political 0.008325659
## 8 presidential 0.008311784
## 9 candidates 0.008145284
## 10 conservative 0.008117534
# short name for every topic: its five top words joined by spaces
topics.labels <- vapply(
  seq_len(n.topics),
  function(k) {
    top_terms <- mallet.top.words(topic.model, topic.words[k, ], num.top.words = 5)
    paste(top_terms$words, collapse = " ")
  },
  character(1)
)
# have a look at keywords for each topic
topics.labels
## [1] "women sex children education school"
## [2] "people planned parenthood group made"
## [3] "obama romney president ryan tax"
## [4] "republican campaign voters party women"
## [5] "life abortion pro choice public"
## [6] "parenthood planned family city years"
## [7] "love work black time street"
## [8] "abortion women state law court"
## [9] "house republican republicans senate bill"
## [10] "parenthood planned health women care"
# show the first few document titles with at least .25 of its content devoted to topic 1
# (doc.topics has one row per document and one column per topic, so topic 1 is column 1;
# the captured output shown below came from an earlier row-indexed version and will differ)
head(pp.2010$TITLE[doc.topics[, 1] > 0.25], 10)
## [1] Paid Notice: Deaths SMITH, RUTH PROSKAUER
## [2] Anti-Abortion Billboards On Race Split Atlanta
## [3] What Every Girl Should Know
## [4] Campaigning for Adoption as Common Ground in Abortion Debate
## [5] Health Plans Must Provide Some Tests At No Cost
## [6] Birth Control Doesn't Have to Mean the Pill
## [7] Paid Notice: Deaths MACLEAN, HELENE (NEE GARDEN)
## [8] Paid Notice: Deaths TYKULSKER, ANNA, ROSE BERNSTEIN
## [9] Paid Notice
## [10] Corrections
## 3791 Levels: 'CONSCIENCE' OF CONSERVATIVES GOES ON THE ATTACK ...
# create a vector that has the title of the most representative text for each topic:
# for every topic (a column of doc.topics) find the document (row) with the largest share
top_doc_per_topic <- apply(doc.topics, 2, which.max)
topics.articles <- as.character(pp.2010$TITLE[top_doc_per_topic])
# note: indexing doc.topics by row here would look up the dominant topic of documents
# 1..n.topics instead, which makes the same few early titles repeat (as in the
# captured output shown below, which predates this fix)
topics.articles
## [1] "Paid Notice: Deaths SMITH, RUTH PROSKAUER"
## [2] "Ruth P. Smith, 102; Abortion-Rights Pioneer"
## [3] "After Long Decline, Teenage Pregnancy Rate Rises"
## [4] "After Long Decline, Teenage Pregnancy Rate Rises"
## [5] "After Long Decline, Teenage Pregnancy Rate Rises"
## [6] "The Candidates, and Supporters From All Over, Push to the Finish Line"
## [7] "After Arrest, Provocateur's Tactics Are Questioned"
## [8] "After Arrest, Provocateur's Tactics Are Questioned"
## [9] "After Arrest, Provocateur's Tactics Are Questioned"
## [10] "From High Jinks to Handcuffs"
# now let's look at how topics differ across different years?
# one topic-by-word matrix per year, stored as topic.words.<year>; pp.2010 was
# filtered to years after 2009, so the 2009 mask selects no documents
for (yr in 2009:2015) {
  assign(
    paste0("topic.words.", yr),
    mallet.subset.topic.words(topic.model, pp.2010$year == yr, smoothed = TRUE, normalized = TRUE)
  )
}
# five top words per topic among the 2010 articles, joined into one label each
topics.labels.2010 <- vapply(
  seq_len(n.topics),
  function(k) {
    top_terms <- mallet.top.words(topic.model, topic.words.2010[k, ], num.top.words = 5)
    paste(top_terms$words, collapse = " ")
  },
  character(1)
)
topics.labels.2010
## [1] "sex education women abstinence pregnancy"
## [2] "people made group planned times"
## [3] "president military ryan million pay"
## [4] "campaign conservative political brown republican"
## [5] "abortion godfrey training life fellowship"
## [6] "family life planned parenthood died"
## [7] "keefe black wetmore institute university"
## [8] "abortion abortions women doctors law"
## [9] "bank republican giannoulias mcmahon house"
## [10] "planned parenthood organization health care"
# five top words per topic among the 2011 articles, joined into one label each
topics.labels.2011 <- vapply(
  seq_len(n.topics),
  function(k) {
    top_terms <- mallet.top.words(topic.model, topic.words.2011[k, ], num.top.words = 5)
    paste(top_terms$words, collapse = " ")
  },
  character(1)
)
topics.labels.2011
## [1] "sex women school education children"
## [2] "people planned news parenthood group"
## [3] "obama president romney government cuts"
## [4] "voters republican conservative party political"
## [5] "life abortion choice public city"
## [6] "city school planned parenthood family"
## [7] "keefe vernacchio man street asked"
## [8] "abortion state law abortions women"
## [9] "house republicans republican democrats spending"
## [10] "parenthood planned health services federal"
# five top words per topic among the 2012 articles, joined into one label each
topics.labels.2012 <- vapply(
  seq_len(n.topics),
  function(k) {
    top_terms <- mallet.top.words(topic.model, topic.words.2012[k, ], num.top.words = 5)
    paste(top_terms$words, collapse = " ")
  },
  character(1)
)
topics.labels.2012
## [1] "women sex children birth control"
## [2] "people planned group made media"
## [3] "romney obama president ryan santorum"
## [4] "women campaign republican voters party"
## [5] "life abortion pro choice public"
## [6] "parenthood university school planned city"
## [7] "street young work love good"
## [8] "abortion women state law court"
## [9] "republican house senate republicans democrats"
## [10] "planned parenthood komen health women"
# five top words per topic among the 2013 articles, joined into one label each
topics.labels.2013 <- vapply(
  seq_len(n.topics),
  function(k) {
    top_terms <- mallet.top.words(topic.model, topic.words.2013[k, ], num.top.words = 5)
    paste(top_terms$words, collapse = " ")
  },
  character(1)
)
topics.labels.2013
## [1] "women percent sex young pregnancy"
## [2] "people group made planned day"
## [3] "obama president tax administration romney"
## [4] "campaign voters party republican women"
## [5] "abortion life pro question public"
## [6] "church city planned family brooklyn"
## [7] "gilbert time love play richards"
## [8] "abortion state women texas abortions"
## [9] "bill republican senate house republicans"
## [10] "cancer planned parenthood health women"
# five top words per topic among the 2014 articles, joined into one label each
topics.labels.2014 <- vapply(
  seq_len(n.topics),
  function(k) {
    top_terms <- mallet.top.words(topic.model, topic.words.2014[k, ], num.top.words = 5)
    paste(top_terms$words, collapse = " ")
  },
  character(1)
)
topics.labels.2014
## [1] "women sex children school data"
## [2] "people planned group day made"
## [3] "obama president government insurance million"
## [4] "women voters republican campaign election"
## [5] "abortion life gomperts buffer pro"
## [6] "planned years sage ny parenthood"
## [7] "dunham book film woman time"
## [8] "abortion women court law state"
## [9] "republican senate democrats republicans house"
## [10] "parenthood health planned care women"
# five top words per topic among the 2015 articles, joined into one label each
topics.labels.2015 <- vapply(
  seq_len(n.topics),
  function(k) {
    top_terms <- mallet.top.words(topic.model, topic.words.2015[k, ], num.top.words = 5)
    paste(top_terms$words, collapse = " ")
  },
  character(1)
)
topics.labels.2015
## [1] "women sex children percent family"
## [2] "people planned nytimes parenthood html"
## [3] "obama president government tax people"
## [4] "trump republican clinton bush presidential"
## [5] "life abortion pro shooting police"
## [6] "parenthood planned years family wife"
## [7] "street year black work guns"
## [8] "abortion law court state women"
## [9] "house republican republicans senate boehner"
## [10] "parenthood planned tissue health videos"
# vectorize them: copy each year's labels into a plain vector named t.<year>
for (yr in 2010:2015) {
  assign(paste0("t.", yr), as.vector(get(paste0("topics.labels.", yr))))
}
# view all the topics as they change over the years: one row per topic, one column per year
topics.over.time <- do.call(cbind, mget(paste0("t.", 2010:2015)))
# look at each topic individually -- the first topic over the years
topics.over.time[1, ]
## t.2010
## "sex education women abstinence pregnancy"
## t.2011
## "sex women school education children"
## t.2012
## "women sex children birth control"
## t.2013
## "women percent sex young pregnancy"
## t.2014
## "women sex children school data"
## t.2015
## "women sex children percent family"
# the second topic: its five-keyword label for each year, 2010 through 2015
topics.over.time[2, ]
## t.2010
## "people made group planned times"
## t.2011
## "people planned news parenthood group"
## t.2012
## "people planned group made media"
## t.2013
## "people group made planned day"
## t.2014
## "people planned group day made"
## t.2015
## "people planned nytimes parenthood html"
# the third topic: its five-keyword label for each year, 2010 through 2015
topics.over.time[3, ]
## t.2010
## "president military ryan million pay"
## t.2011
## "obama president romney government cuts"
## t.2012
## "romney obama president ryan santorum"
## t.2013
## "obama president tax administration romney"
## t.2014
## "obama president government insurance million"
## t.2015
## "obama president government tax people"
# the fourth topic: its five-keyword label for each year, 2010 through 2015
topics.over.time[4, ]
## t.2010
## "campaign conservative political brown republican"
## t.2011
## "voters republican conservative party political"
## t.2012
## "women campaign republican voters party"
## t.2013
## "campaign voters party republican women"
## t.2014
## "women voters republican campaign election"
## t.2015
## "trump republican clinton bush presidential"
# the fifth topic: its five-keyword label for each year, 2010 through 2015
topics.over.time[5, ]
## t.2010
## "abortion godfrey training life fellowship"
## t.2011
## "life abortion choice public city"
## t.2012
## "life abortion pro choice public"
## t.2013
## "abortion life pro question public"
## t.2014
## "abortion life gomperts buffer pro"
## t.2015
## "life abortion pro shooting police"
# the sixth topic: its five-keyword label for each year, 2010 through 2015
topics.over.time[6, ]
## t.2010
## "family life planned parenthood died"
## t.2011
## "city school planned parenthood family"
## t.2012
## "parenthood university school planned city"
## t.2013
## "church city planned family brooklyn"
## t.2014
## "planned years sage ny parenthood"
## t.2015
## "parenthood planned years family wife"
# the seventh topic: its five-keyword label for each year, 2010 through 2015
topics.over.time[7, ]
## t.2010
## "keefe black wetmore institute university"
## t.2011
## "keefe vernacchio man street asked"
## t.2012
## "street young work love good"
## t.2013
## "gilbert time love play richards"
## t.2014
## "dunham book film woman time"
## t.2015
## "street year black work guns"
# the eighth topic: its five-keyword label for each year, 2010 through 2015
topics.over.time[8, ]
## t.2010
## "abortion abortions women doctors law"
## t.2011
## "abortion state law abortions women"
## t.2012
## "abortion women state law court"
## t.2013
## "abortion state women texas abortions"
## t.2014
## "abortion women court law state"
## t.2015
## "abortion law court state women"
# the ninth topic: its five-keyword label for each year, 2010 through 2015
topics.over.time[9, ]
## t.2010
## "bank republican giannoulias mcmahon house"
## t.2011
## "house republicans republican democrats spending"
## t.2012
## "republican house senate republicans democrats"
## t.2013
## "bill republican senate house republicans"
## t.2014
## "republican senate democrats republicans house"
## t.2015
## "house republican republicans senate boehner"
# the tenth topic: its five-keyword label for each year, 2010 through 2015
topics.over.time[10, ]
## t.2010
## "planned parenthood organization health care"
## t.2011
## "parenthood planned health services federal"
## t.2012
## "planned parenthood komen health women"
## t.2013
## "cancer planned parenthood health women"
## t.2014
## "parenthood health planned care women"
## t.2015
## "parenthood planned tissue health videos"
We can also represent the topics visually, as word clouds of their top words:
# with the wordcloud package: draw the 100 top words of a single topic
topic.num <- 1
num.top.words <- 100
# use the two settings above instead of repeating their values as literals
topic.top.words <- mallet.top.words(topic.model, topic.words[topic.num, ], num.top.words)
# NOTE(review): ordered.colors is dropped -- it is meant for one colour per word, and
# with the single colour given here the default colour mapping already paints every
# word red; confirm the rendered cloud looks as intended
wordcloud(
  topic.top.words$words,
  topic.top.words$weights,
  scale = c(4, .8),
  rot.per = 0,
  random.order = FALSE,
  colors = "red"
)
# draw one word cloud per topic, each built from that topic's 25 top words
# (topic.words has one row per topic, so its row count is the number of topics)
num.topics <- nrow(topic.words)
num.top.words <- 25
for (i in seq_len(num.topics)) {
  topic.top.words <- mallet.top.words(topic.model, topic.words[i, ], num.top.words)
  # NOTE(review): ordered.colors is dropped -- it is meant for one colour per word, and
  # with a single colour the default mapping already paints every word red; confirm
  # the rendered clouds look as intended
  wordcloud(
    topic.top.words$words,
    topic.top.words$weights,
    scale = c(4, .8),
    rot.per = 0,
    random.order = FALSE,
    colors = "red"
  )
}
We can also create a cluster dendrogram of the topics.
# from http://www.cs.princeton.edu/~mimno/R/clustertrees.R
# flip doc.topics so that topics are rows and documents are columns, then rescale
# every row so that it sums to 1
topic.docs <- t(doc.topics)
topic.docs <- sweep(topic.docs, 1, rowSums(topic.docs), FUN = "/")
write.csv(
  topic.docs,
  file = "/Users/elizabeth/Documents/Berkeley/PS239T/ps239T-final-project/Results/topic-docs.csv"
)
# short name for every topic: its five top words joined by spaces
topics.labels <- vapply(
  seq_len(n.topics),
  function(k) {
    top_terms <- mallet.top.words(topic.model, topic.words[k, ], num.top.words = 5)
    paste(top_terms$words, collapse = " ")
  },
  character(1)
)
# have a look at keywords for each topic
topics.labels
## [1] "women sex children education school"
## [2] "people planned parenthood group made"
## [3] "obama romney president ryan tax"
## [4] "republican campaign voters party women"
## [5] "life abortion pro choice public"
## [6] "parenthood planned family city years"
## [7] "love work black time street"
## [8] "abortion women state law court"
## [9] "house republican republicans senate bill"
## [10] "parenthood planned health women care"
# save the topic labels next to the other results
write.csv(
  topics.labels,
  file = "/Users/elizabeth/Documents/Berkeley/PS239T/ps239T-final-project/Results/topic-labels.csv"
)
# data.frame with one row per topic and one column per document, columns named by document id
topic_docs <- data.frame(topic.docs)
colnames(topic_docs) <- pp.2010$id
# hierarchical clustering of the topics by the distance between their word distributions
plot(hclust(dist(topic.words)), labels = topics.labels)
Now we can repeat this analysis on a larger subset of the data: the articles published after 1982, up to the present.
# fix the seed of R's random number generator so R-side randomness is repeatable
# NOTE(review): MALLET runs inside the JVM, so this seed presumably does not reach the
# topic sampler -- confirm against the mallet docs whether the model needs its own seed
set.seed(12345)
# load the libraries we will need for this section
library(mallet) # a wrapper around the Java machine learning tool MALLET
library(wordcloud) # to visualize wordclouds
# subset the data to articles published after 1982 (i.e. 1983 onward), the dates for
# which we have the complete data; `year` is stored as text, so it is converted to an
# integer before the numeric comparison (rows with a missing year are dropped)
pp.1982 <- subset(pp, as.integer(year) > 1982)
# we first have to create an 'id' column; the row names serve as document ids
pp.1982$id <- rownames(pp.1982)
# remove punctuation: every punctuation character is replaced by a space
# (this includes apostrophes, so the "'" allowed by token.regexp below never occurs)
pp.1982$TEXT <- gsub(pattern = "[[:punct:]]", replacement = " ", x = pp.1982$TEXT)
# load data into mallet: document ids, document text, the stop-word file, no case
# preservation, and a token pattern of letter runs
mallet.instances <- mallet.import(
  pp.1982$id,
  pp.1982$TEXT,
  "/Users/elizabeth/Documents/Berkeley/PS239T/ps239T-final-project/Data/stoplist.csv",
  preserve.case = FALSE,
  token.regexp = "[\\p{L}']+"
)
# how many topics the model should look for
n.topics <- 10
# build the topic trainer object and hand it our documents
topic.model <- MalletLDA(n.topics)
topic.model$loadDocuments(mallet.instances)
# pull the vocabulary and per-word frequency statistics. these may be useful in
# further curating the stopword list.
vocabulary <- topic.model$getVocabulary()
word.freqs <- mallet.word.freqs(topic.model)
# peek at the first 50 vocabulary entries
word.freqs[seq_len(50), ]
## words term.freq doc.freq
## 1 richard 76 63
## 2 schweiker 26 8
## 3 secretary 511 334
## 4 health 5600 1696
## 5 human 1064 603
## 6 services 2391 1105
## 7 today 2099 948
## 8 recommended 111 99
## 9 rule 707 329
## 10 requiring 367 264
## 11 family 3902 1504
## 12 planning 1954 763
## 13 clinics 2772 844
## 14 supported 481 391
## 15 federal 3839 1300
## 16 money 2250 932
## 17 notify 118 90
## 18 parents 1291 529
## 19 minors 151 99
## 20 receive 458 363
## 21 birth 2013 808
## 22 control 2153 968
## 23 pills 560 174
## 24 diaphragms 35 27
## 25 intrauterine 71 53
## 26 devices 155 102
## 27 planned 7738 3773
## 28 parenthood 7559 3807
## 29 federation 1041 803
## 30 america 1607 1096
## 31 threaten 45 45
## 32 teen 1160 295
## 33 agers 592 195
## 34 families 607 376
## 35 immediately 256 233
## 36 moved 476 371
## 37 district 948 515
## 38 court 7542 1201
## 39 block 433 299
## 40 implementation 22 17
## 41 grounds 194 163
## 42 violated 146 118
## 43 statutes 80 53
## 44 constitution 550 232
## 45 guarantee 125 102
## 46 invasion 26 24
## 47 privacy 412 228
## 48 approved 554 338
## 49 office 1472 813
## 50 management 261 189
# rank the vocabulary from the most to the least frequent term and show the top rows
freq_rank <- with(word.freqs, order(-term.freq))
word.freqs.ordered <- word.freqs[freq_rank, ]
head(word.freqs.ordered)
## words term.freq doc.freq
## 98 abortion 16465 1952
## 149 women 10390 2042
## 27 planned 7738 3773
## 28 parenthood 7559 3807
## 38 court 7542 1201
## 560 state 6152 1642
# optimize hyperparameters every 20 iterations, after 50 burn-in iterations.
topic.model$setAlphaOptimization(20, 50)
# now train a model. Note that hyperparameter optimization is on, by default. We can specify the number of iterations; here we use 100.
topic.model$train(100)
# get the probability of topics in documents and the probability of words in topics; by default the functions return word counts, so to get the probabilities we can normalize and add smoothing, in order to ensure that nothing has a probability of exactly 0
doc.topics <- mallet.doc.topics(topic.model, smoothed = TRUE, normalized = TRUE)
topic.words <- mallet.topic.words(topic.model, smoothed = TRUE, normalized = TRUE)
# what are the top words in topic 6? (row 6 of topic.words)
mallet.top.words(topic.model, topic.words[6, ])
## words weights
## 1 women 0.020910376
## 2 people 0.007577855
## 3 time 0.007009778
## 4 life 0.006357724
## 5 years 0.005532777
## 6 world 0.005088195
## 7 american 0.004964700
## 8 percent 0.004806627
## 9 make 0.004243490
## 10 work 0.003813727
# short name for every topic: its five top words joined by spaces
topics.labels <- vapply(
  seq_len(n.topics),
  function(k) {
    top_terms <- mallet.top.words(topic.model, topic.words[k, ], num.top.words = 5)
    paste(top_terms$words, collapse = " ")
  },
  character(1)
)
# have a look at keywords for each topic
topics.labels
## [1] "health sex family school education"
## [2] "planned university parenthood family school"
## [3] "republican president campaign party political"
## [4] "abortion women abortions planned parenthood"
## [5] "people told police time day"
## [6] "women people time life years"
## [7] "street tickets art avenue center"
## [8] "year million company money years"
## [9] "bill house federal government planned"
## [10] "court law justice abortion supreme"
# show the first few document titles with at least .25 of its content devoted to topic 1
# (doc.topics has one row per document and one column per topic, so topic 1 is column 1;
# the captured output shown below came from an earlier row-indexed version and will differ)
head(pp.1982$TITLE[doc.topics[, 1] > 0.25], 10)
## [1] New Birth Control Rule Near
## [2] A LEGISLATIVE BATTLE IN PENNSYLVANIA
## [3] THE NATION; Birth-Control Rule Challenged
## [4] THE SQUEAL RULE'
## [5] NEW YORK DAY BY DAY
## [6] ABOUT NEW YORK
## [7] COURT REAFFIRMS RIGHT TO ABORTION AND BARS VARIETY OF LOCAL CURBS; Excerpts from Akron case, page B10.
## [8] THE NATION; ABortion Foes Overreach in The Senate
## [9] BRIEFING
## [10] PLANNED PARENTHOOD DELETED FROM FEDERAL CHARITY DRIVE
## 3791 Levels: 'CONSCIENCE' OF CONSERVATIVES GOES ON THE ATTACK ...
# create a vector that has the title of the most representative text for each topic:
# for every topic (a column of doc.topics) find the document (row) with the largest share
top_doc_per_topic <- apply(doc.topics, 2, which.max)
topics.articles <- as.character(pp.1982$TITLE[top_doc_per_topic])
# note: indexing doc.topics by row here would look up the dominant topic of documents
# 1..n.topics instead, which makes the same few early titles repeat (as in the
# captured output shown below, which predates this fix)
topics.articles
## [1] "New Birth Control Rule Near"
## [2] "A LEGISLATIVE BATTLE IN PENNSYLVANIA"
## [3] "BIRTH-CONTROL RULE: CLINICS PONDER EFFECTS"
## [4] "BIRTH-CONTROL RULE: CLINICS PONDER EFFECTS"
## [5] "A LEGISLATIVE BATTLE IN PENNSYLVANIA"
## [6] "U.S. TO REQUIRE NOTICE TO PARENTS IF CHILDREN RECEIVE CONTRACEPTIVES"
## [7] "DEMONSTRATIONS MARK 10 YEARS OF LEGAL ABORTION"
## [8] "ABORTION RULING: 10 YEARS OF BITTER CONFLICT"
## [9] "A LEGISLATIVE BATTLE IN PENNSYLVANIA"
## [10] "New Birth Control Rule Near"
# now let's look at how topics differ across different years?
# one topic-by-word matrix per year from 1983 through 2015, each restricted to that
# year's articles and stored under the name topic.words.<year>
for (yr in 1983:2015) {
  assign(
    paste0("topic.words.", yr),
    mallet.subset.topic.words(topic.model, pp.1982$year == yr, smoothed = TRUE, normalized = TRUE)
  )
}
# for every year from 1983 through 2015, label each topic with its five top words
# among that year's articles; the result is stored as topics.labels.<year>
for (yr in 1983:2015) {
  year_topic_words <- get(paste0("topic.words.", yr))
  year_labels <- vapply(
    seq_len(n.topics),
    function(k) {
      top_terms <- mallet.top.words(topic.model, year_topic_words[k, ], num.top.words = 5)
      paste(top_terms$words, collapse = " ")
    },
    character(1)
  )
  assign(paste0("topics.labels.", yr), year_labels)
}
# vectorize them
# Create one t.YYYY vector per year from the matching topics.labels.YYYY.
# Every year from 1983 through 2015 is assigned exactly once, in
# chronological order (the hand-written list assigned 2001-2005 twice and
# placed 2000 after 2005).
for (vector.year in 1983:2015) {
  assign(
    paste0("t.", vector.year),
    as.vector(get(paste0("topics.labels.", vector.year)))
  )
}
# create a matrix with all the topics over time
# One column per year (t.1983 ... t.2015, in that order) and one row per
# topic. mget() returns the yearly vectors as a named list, so cbind() labels
# each column with the name of the vector it came from.
topics.over.time <- do.call(cbind, mget(paste0("t.", 1983:2015)))
Now we can look at how the topics have changed over the years, to see if Planned Parenthood has become a more politicized issue over time, or perhaps during certain election cycles.
# look at each topic individually -- the first topic over the years
# (row 1 of topics.over.time: one five-word label per year, 1983-2015;
# the printed labels are dominated by sex, health, school and education)
topics.over.time[1, ]
## t.1983
## "family parents school sex health"
## t.1984
## "family county planning teen health"
## t.1985
## "family teen program health services"
## t.1986
## "teen agers education school family"
## t.1987
## "teen school sex aids family"
## t.1988
## "family health program planning teen"
## t.1989
## "teen aids children wattleton parents"
## t.1990
## "aids teen education parenthood sex"
## t.1991
## "family program teen school aids"
## t.1992
## "aids teen children school year"
## t.1993
## "school teen program pregnant year"
## t.1994
## "teen school health family education"
## t.1995
## "health care catholic city services"
## t.1996
## "aids teen sex school children"
## t.1997
## "care health family county teen"
## t.1998
## "family health catholic children services"
## t.1999
## "health condom sex school education"
## t.2000
## "sex parents school education children"
## t.2001
## "pedreira education children services religious"
## t.2002
## "health services city cancer children"
## t.2003
## "sex sexual teenagers school city"
## t.2004
## "sex health education girls school"
## t.2005
## "sex education parents children school"
## t.2006
## "sex health school children sexual"
## t.2007
## "sex abstinence education health funds"
## t.2008
## "health age sex care young"
## t.2009
## "sex health education parents young"
## t.2010
## "sex education abstinence health pregnancy"
## t.2011
## "sex health education school services"
## t.2012
## "komen health cancer breast catholic"
## t.2013
## "cancer breast health sex percent"
## t.2014
## "sex health school family children"
## t.2015
## "sex health planned sexually family"
# the second topic over the years
# (row 2: the printed labels are dominated by university, planned,
# parenthood, family and school)
topics.over.time[2, ]
## t.1983
## "parenthood university college planned chairman"
## t.1984
## "university family planned board school"
## t.1985
## "university family years planned church"
## t.1986
## "college canfield university planned school"
## t.1987
## "president university planned school late"
## t.1988
## "parenthood university planned president school"
## t.1989
## "university parenthood planned years school"
## t.1990
## "university planned years family father"
## t.1991
## "university years church school college"
## t.1992
## "university college president planned degrees"
## t.1993
## "president university daughter father school"
## t.1994
## "college school parenthood planned years"
## t.1995
## "planned parenthood university college plimpton"
## t.1996
## "family father husband planned university"
## t.1997
## "planned parenthood family board lichtenstein"
## t.1998
## "university planned family parenthood college"
## t.1999
## "parenthood university planned died nytimes"
## t.2000
## "dyson planned parenthood university nytimes"
## t.2001
## "planned family parenthood board wife"
## t.2002
## "board wife planned parenthood university"
## t.2003
## "leon levy planned parenthood board"
## t.2004
## "planned parenthood ny died school"
## t.2005
## "board family school rabbi planned"
## t.2006
## "died school university hartford family"
## t.2007
## "university planned parenthood college family"
## t.2008
## "mott planned parenthood church esther"
## t.2009
## "years family university school served"
## t.2010
## "university planned family parenthood life"
## t.2011
## "planned parenthood school law family"
## t.2012
## "planned parenthood university college family"
## t.2013
## "family planned parenthood college mother"
## t.2014
## "sage ny planned years family"
## t.2015
## "planned university parenthood family wife"
# the third topic over the years
# (row 3: the printed labels are dominated by president, campaign,
# republican and the names of politicians of each period)
topics.over.time[3, ]
## t.1983
## "reagan campaign national president vote"
## t.1984
## "reagan president political issues national"
## t.1985
## "president national reagan rights campaign"
## t.1986
## "president national campaign reagan committee"
## t.1987
## "president conservative campaign political reagan"
## t.1988
## "bush president dukakis political state"
## t.1989
## "rights president political national bush"
## t.1990
## "president political campaign schroeder issue"
## t.1991
## "bush president republican political groups"
## t.1992
## "bush president campaign republican political"
## t.1993
## "campaign clinton rights president political"
## t.1994
## "clinton president rights groups republican"
## t.1995
## "foster nomination republican president political"
## t.1996
## "campaign republican clinton dole president"
## t.1997
## "president clinton campaign political election"
## t.1998
## "political mccaughey campaign pataki republican"
## t.1999
## "governor president bush political republican"
## t.2000
## "bush campaign gore president republican"
## t.2001
## "bush president political campaign party"
## t.2002
## "forrester bush voters lautenberg republican"
## t.2003
## "bush president campaign republican schwarzenegger"
## t.2004
## "bush president democratic party conservative"
## t.2005
## "president senator republican bush democrats"
## t.2006
## "bush president republican senator rights"
## t.2007
## "republican president campaign rights giuliani"
## t.2008
## "party political president republican bush"
## t.2009
## "obama president campaign conservative bush"
## t.2010
## "conservative campaign political republican national"
## t.2011
## "republican party conservative president republicans"
## t.2012
## "romney obama republican campaign president"
## t.2013
## "republican party campaign voters democratic"
## t.2014
## "republican voters democratic campaign election"
## t.2015
## "trump republican party clinton campaign"
# the fourth topic over the years
# (row 4: the printed labels are dominated by abortion, women and clinics)
topics.over.time[4, ]
## t.1983
## "abortion women clinics health planned"
## t.1984
## "abortion women abortions planning family"
## t.1985
## "abortion women clinics abortions clinic"
## t.1986
## "abortion clinics women medical abortions"
## t.1987
## "abortion women planned abortions clinics"
## t.1988
## "abortion women abortions clinics planning"
## t.1989
## "abortion women abortions planned parenthood"
## t.1990
## "abortion women abortions parenthood planned"
## t.1991
## "abortion clinics women planning abortions"
## t.1992
## "abortion women abortions clinics planned"
## t.1993
## "abortion women abortions clinics clinic"
## t.1994
## "abortion women clinic clinics planned"
## t.1995
## "abortion abortions clinics women doctors"
## t.1996
## "abortion women abortions planned parenthood"
## t.1997
## "abortion women abortions medical planned"
## t.1998
## "abortion abortions women doctors anti"
## t.1999
## "abortion doctors women drug patients"
## t.2000
## "abortion women abortions drug doctors"
## t.2001
## "abortion clinics planned parenthood women"
## t.2002
## "abortion women parenthood planned pregnancy"
## t.2003
## "abortion women abortions drug pills"
## t.2004
## "abortion women drug records medical"
## t.2005
## "abortion women drug abortions health"
## t.2006
## "abortion women abortions drug planned"
## t.2007
## "abortion women health planned abortions"
## t.2008
## "abortion health women planned abortions"
## t.2009
## "abortion women tiller abortions health"
## t.2010
## "abortion women abortions doctors parenthood"
## t.2011
## "abortion women abortions parenthood planned"
## t.2012
## "abortion women parenthood planned abortions"
## t.2013
## "abortion women abortions clinics health"
## t.2014
## "abortion women clinics abortions medical"
## t.2015
## "abortion parenthood planned women fetal"
# the fifth topic over the years
# (row 5: the printed labels are dominated by people, police, clinic and
# proper names that change from year to year)
topics.over.time[5, ]
## t.1983
## "day people back called asked"
## t.1984
## "violence clinic people years year"
## t.1985
## "people clinic bombings bureau called"
## t.1986
## "police people bomb man clinic"
## t.1987
## "malvasi police fire people life"
## t.1988
## "people day rescue police man"
## t.1989
## "people fulghum time day navratilova"
## t.1990
## "people told rescue police operation"
## t.1991
## "police film people home members"
## t.1992
## "people rescue operation ireland told"
## t.1993
## "mero people rescue mother gunn"
## t.1994
## "people clinic rescue police violence"
## t.1995
## "salvi clinic police people violence"
## t.1996
## "kassindja salvi people men mark"
## t.1997
## "bombing clinic men people year"
## t.1998
## "people slepian book ross buffalo"
## t.1999
## "kopp people man smith time"
## t.2000
## "people book comfort told back"
## t.2001
## "people kopp authorities called back"
## t.2002
## "police fire day baby people"
## t.2003
## "kopp people juska pictures slepian"
## t.2004
## "alberto jasmine child people girl"
## t.2005
## "people told book man day"
## t.2006
## "people father police told buffalo"
## t.2007
## "arias told people home year"
## t.2008
## "book shirt people grand hammer"
## t.2009
## "case people day clinic agents"
## t.2010
## "keefe wetmore black people office"
## t.2011
## "keefe vernacchio man people told"
## t.2012
## "people told day time kimbrough"
## t.2013
## "people told mother baby day"
## t.2014
## "dunham people book time woman"
## t.2015
## "people shooting gun police colorado"
# the sixth topic over the years
# (row 6: the printed labels are dominated by women, people, time and life)
topics.over.time[6, ]
## t.1983
## "women sponge life years time"
## t.1984
## "population women world united countries"
## t.1985
## "women population world men time"
## t.1986
## "women time life genetic people"
## t.1987
## "women population world years people"
## t.1988
## "women world population percent men"
## t.1989
## "women people american life years"
## t.1990
## "women years life time people"
## t.1991
## "women people years disabled time"
## t.1992
## "women people time life make"
## t.1993
## "women louise time population people"
## t.1994
## "women people population time world"
## t.1995
## "women people life time percent"
## t.1996
## "women men gay life people"
## t.1997
## "women people life time long"
## t.1998
## "women time american world life"
## t.1999
## "women sponge world life time"
## t.2000
## "women people life nytimes american"
## t.2001
## "mcgreevey schundler women work people"
## t.2002
## "women people men percent time"
## t.2003
## "women people time life work"
## t.2004
## "women american people life years"
## t.2005
## "women people years good time"
## t.2006
## "women time people life world"
## t.2007
## "women percent people years research"
## t.2008
## "women people time editor age"
## t.2009
## "women time people world life"
## t.2010
## "women make people nietzsche time"
## t.2011
## "women people social time american"
## t.2012
## "women time people years life"
## t.2013
## "women time percent people years"
## t.2014
## "women time people work make"
## t.2015
## "women people time life pro"
# the seventh topic over the years
# (row 7: the printed labels are dominated by street, tickets, art and
# place names)
topics.over.time[7, ]
## t.1983
## "harlem street city town shop"
## t.1984
## "center sale art arts tremaine"
## t.1985
## "papp street theater festival park"
## t.1986
## "street avenue thrift shop east"
## t.1987
## "street east manhattan artists avenue"
## t.1988
## "graffiti hepburn city manhattan street"
## t.1989
## "street manhattan tickets park space"
## t.1990
## "menninger museum hudson street river"
## t.1991
## "street tickets benefit dinner avenue"
## t.1992
## "street tickets benefit fashion show"
## t.1993
## "tickets street rock benefit music"
## t.1994
## "tickets street benefit dinner avenue"
## t.1995
## "street tickets kitchen dinner benefit"
## t.1996
## "hall tickets museum house film"
## t.1997
## "art norman city museum center"
## t.1998
## "tickets street hours avenue july"
## t.1999
## "tickets benefit june street dinner"
## t.2000
## "island raven summer guida hamptons"
## t.2001
## "island today street hanssen center"
## t.2002
## "homeless street square park shelter"
## t.2003
## "westchester street film space photography"
## t.2004
## "street house french cooking food"
## t.2005
## "club village zoning rochelle church"
## t.2006
## "thompson tenants artists street art"
## t.2007
## "street hours center road hyder"
## t.2008
## "hall staten house island white"
## t.2009
## "street songs petrusich east music"
## t.2010
## "street art apartment de house"
## t.2011
## "street east music band petrusich"
## t.2012
## "street vaughn city brooklyn east"
## t.2013
## "church brooklyn queens attends thompson"
## t.2014
## "art dance street goldwyn city"
## t.2015
## "street art show friday saturday"
# the eighth topic over the years
# (row 8: the printed labels are dominated by year, million, company and
# money)
topics.over.time[8, ]
## t.1983
## "year fund years charity company"
## t.1984
## "rockefeller year million fund money"
## t.1985
## "bours business profit year years"
## t.1986
## "company advertising agency million year"
## t.1987
## "nonprofit people million company year"
## t.1988
## "volunteer million year work company"
## t.1989
## "year fund money organization million"
## t.1990
## "company year companies corporate planned"
## t.1991
## "fees year people million members"
## t.1992
## "year united million money planned"
## t.1993
## "company year time years work"
## t.1994
## "year million job years business"
## t.1995
## "year parenthood work money groups"
## t.1996
## "year company fauziya cut people"
## t.1997
## "carey year company people money"
## t.1998
## "company million johnson fund year"
## t.1999
## "million company money year companies"
## t.2000
## "million companies corzine industry money"
## t.2001
## "anthrax group aarp fund waagner"
## t.2002
## "agency nelson company year building"
## t.2003
## "letters software company million convio"
## t.2004
## "year company cookies fund letter"
## t.2005
## "year money business years planned"
## t.2006
## "year years work buffett company"
## t.2007
## "million money foundation fund company"
## t.2008
## "million year group ivins grand"
## t.2009
## "ivins anthrax year years letters"
## t.2010
## "bank planned year organization parenthood"
## t.2011
## "year money beck million business"
## t.2012
## "foundation planned parenthood million people"
## t.2013
## "gilbert year million business corporate"
## t.2014
## "million year years company parenthood"
## t.2015
## "planned business year parenthood people"
# the ninth topic over the years
# (row 9: the printed labels are dominated by bill, house, federal and
# government)
topics.over.time[9, ]
## t.1983
## "federal congress planned budget bill"
## t.1984
## "administration policy government federal states"
## t.1985
## "federal administration aid congress money"
## t.1986
## "money federal state planned bill"
## t.1987
## "federal administration government policy congress"
## t.1988
## "federal funds money government administration"
## t.1989
## "bill government legislation house bills"
## t.1990
## "bill house governor roemer veto"
## t.1991
## "bill federal house congress government"
## t.1992
## "federal government bill president house"
## t.1993
## "administration bill care health federal"
## t.1994
## "federal house insurance care coverage"
## t.1995
## "bill house federal health president"
## t.1996
## "bill federal planned house congress"
## t.1997
## "bill money federal house davis"
## t.1998
## "bill house legislation president congress"
## t.1999
## "bill house state president legislation"
## t.2000
## "bill spending texas health coverage"
## t.2001
## "federal administration bill money house"
## t.2002
## "bill administration government health house"
## t.2003
## "bill senate health house measure"
## t.2004
## "government federal administration house health"
## t.2005
## "government santorum house bill president"
## t.2006
## "bill richards house government plan"
## t.2007
## "bill federal health state house"
## t.2008
## "care health bill obama proposed"
## t.2009
## "bill house health insurance senate"
## t.2010
## "federal planned parenthood health money"
## t.2011
## "house federal bill government republicans"
## t.2012
## "obama romney ryan health president"
## t.2013
## "texas bill state house republican"
## t.2014
## "texas state davis health care"
## t.2015
## "planned parenthood house government republican"
# the tenth topic over the years
# (row 10: the printed labels are dominated by court, law, justice and
# supreme)
topics.over.time[10, ]
## t.1983
## "court judge rule decision federal"
## t.1984
## "court law case washburn state"
## t.1985
## "court state supreme law case"
## t.1986
## "court abortion state roe decision"
## t.1987
## "court law judge bork supreme"
## t.1988
## "court federal abortion judge decision"
## t.1989
## "court abortion roe law supreme"
## t.1990
## "court judge abortion souter supreme"
## t.1991
## "court law supreme abortion case"
## t.1992
## "court justice abortion law state"
## t.1993
## "court law abortion justice rights"
## t.1994
## "court souter law justice abortion"
## t.1995
## "court law abortion federal case"
## t.1996
## "court rehnquist law justice case"
## t.1997
## "court suicide state assisted supreme"
## t.1998
## "court law abortion roe judge"
## t.1999
## "court justice law blackmun abortion"
## t.2000
## "court law nebraska abortion state"
## t.2001
## "court connor law abortion federal"
## t.2002
## "court case supreme law judge"
## t.2003
## "court law case bowers abortion"
## t.2004
## "justice court law blackmun abortion"
## t.2005
## "court judge justice law alito"
## t.2006
## "court justice judge alito law"
## t.2007
## "court stevens justice law abortion"
## t.2008
## "law court federal judge jury"
## t.2009
## "court justice judge law abortion"
## t.2010
## "court law supreme case justice"
## t.2011
## "law court state abortion supreme"
## t.2012
## "court law justice supreme state"
## t.2013
## "court law supreme justice case"
## t.2014
## "court law supreme state justice"
## t.2015
## "court law case supreme justice"
The ninth topic is especially interesting—it appears to track scandals or politicized issues that Planned Parenthood is embroiled in. To see how it has changed over the years: in 1985, the most common words in the topic were “bours public office called investigation.” In 1993, they were “death suicide public told office.” In 1999, they were “kopp smith web death site.” In 2006, they were “death kline group found called.” In 2012, they were “told video case web kimbrough.” And in 2015, they were “tissue fetal video planned people.” (Note: these example words do not match the ninth-topic output printed above, which centers on federal legislation; they appear to come from a different run of the model, so the topic number may differ.) We can also trace changes in the topic that’s about the Supreme Court, in the topic that’s about elections, and in the topic that’s about sex education—these prove to be very informative topics from which we can build interesting additional research questions!
We can also represent these topics visually, as follows:
# with the wordcloud package
# First draw a single large cloud for one topic, then a smaller cloud for
# every topic in the model.

# --- one topic, 100 words ---
topic.num <- 1
num.top.words <- 100
# use the variables set above rather than repeating the literals 1 and 100
topic.top.words <- mallet.top.words(topic.model, topic.words[topic.num, ], num.top.words)
wordcloud(topic.top.words$words, topic.top.words$weights, scale = c(4, .8),
          rot.per = 0, random.order = FALSE, colors = "red", ordered.colors = TRUE)

# --- every topic, 25 words each ---
# take the topic count from the model setting instead of hard-coding 10
num.topics <- n.topics
num.top.words <- 25
for (i in seq_len(num.topics)) {
  topic.top.words <- mallet.top.words(topic.model, topic.words[i, ], num.top.words)
  # TRUE/FALSE spelled out: T and F are ordinary variables that can be reassigned
  wordcloud(topic.top.words$words, topic.top.words$weights, scale = c(4, .8),
            rot.per = 0, random.order = FALSE, colors = "red", ordered.colors = TRUE)
}
We can also create a cluster dendrogram.
# from http://www.cs.princeton.edu/~mimno/R/clustertrees.R
# Flip doc.topics so that its columns become rows, then divide every row by
# its own total so that each row sums to 1.
topic.docs <- local({
  flipped <- t(doc.topics)
  flipped / rowSums(flipped)
})
# keep a copy of the normalized matrix on disk
write.csv(
  topic.docs,
  file = "/Users/elizabeth/Documents/Berkeley/PS239T/ps239T-final-project/Results/topic-docs2.csv"
)
# Get a vector containing short names for the topics
# Each label is the five words returned by mallet.top.words() for that topic's
# row of topic.words, joined with single spaces.
topics.labels <- vapply(
  seq_len(n.topics),
  function(topic.index) {
    top.five <- mallet.top.words(topic.model, topic.words[topic.index, ], num.top.words = 5)
    paste(top.five$words, collapse = " ")
  },
  character(1)
)
# have a look at keywords for each topic
topics.labels
## [1] "health sex family school education"
## [2] "planned university parenthood family school"
## [3] "republican president campaign party political"
## [4] "abortion women abortions planned parenthood"
## [5] "people told police time day"
## [6] "women people time life years"
## [7] "street tickets art avenue center"
## [8] "year million company money years"
## [9] "bill house federal government planned"
## [10] "court law justice abortion supreme"
# save the topic labels alongside the other results
write.csv(
  x = topics.labels,
  file = "/Users/elizabeth/Documents/Berkeley/PS239T/ps239T-final-project/Results/topic-labels2.csv"
)
# create data.frame with columns as docs and rows as topics;
# the columns are named after the document ids in pp.1982
topic_docs <- setNames(data.frame(topic.docs), pp.1982$id)
# cluster based on shared words
# Computes pairwise distances between the rows of topic.words, clusters them
# hierarchically, and plots the resulting tree with the short topic labels at
# the leaves. dist() and hclust() both run with their default settings.
# NOTE(review): the default sub-title and x-axis annotation of the plot appear
# to be taken from the expression passed to hclust(), so keep the nested call
# exactly as written unless xlab/sub are set explicitly -- confirm against the
# plot.hclust documentation.
plot(hclust(dist(topic.words)), labels=topics.labels)